TD DSA 2021 de Antoine Ly - rapport de Fabien Faivre
L'approche déployée consiste à analyser des tweets en langue anglaise et de prédire les sentiments qu'ils portent : {negative: -1, neutral: 0, positive: 1}
Dans cet exercice, la langue anglaise est un facteur facilitant dans la mesure où beaucoup de modèles préentrainés existent dans cette langue.
La difficulté dans cet exercice provient de sa source : les tweets. Les approches classiques reposent sur :
thx);-) ou pour marquer une émotion forte !!! En complément au sujet du TD lui-même, celui-ci a été l'occasion de monter en compétence avec les (je l'espère) bonnes pratiques de codage et l'utilisation de techniques de MLOps.
Le code de ce projet a été organisé en s'appuyant sur le framework open source orbyter de la société Manifold.ai. Ce framework pousse à la standardisation de la structure du code, via l'utilisation de cookiecutter et promeut un développement dans un environnement dockerisé dès le départ :

La logique de développement prônée est disponible ici
Plusieurs modifications ont dû être apportées aux paramètres du docker-compose pour permettre un accès aux ressources GPU depuis le docker.
Le code a été versionné et est disponible ici github
!pip install textblob
!pip install emot
!pip install wordcloud
#Temps et fichiers
import os
import warnings
import time
from datetime import timedelta
#Manipulation de données
import pandas as pd
import numpy as np
# Text
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.util import ngrams
from textblob import TextBlob
import string
import re
import spacy
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
#Modélisation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
#Evaluation
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_score, recall_score
#Visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
#Tracking d'expérience
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
!pip freeze > /mnt/docker/requirements.txt
#Cette cellule permet d'appeler la version packagée du projet et d'en assurer le reload avant appel des fonctions
%load_ext autoreload
%autoreload 2
from dsa_sentiment.scripts.make_dataset import load_data
from dsa_sentiment.scripts.evaluate import eval_metrics
from dsa_sentiment.scripts.make_dataset import Preprocess_StrLower, Preprocess_transform_target
mlflow.tracking.get_tracking_uri()
!pwd
data_folder = os.path.join('/mnt', 'data', 'raw')
all_raw_files = [os.path.join(data_folder, fname)
for fname in os.listdir(data_folder)]
all_raw_files
random_state=42
Il n'est pas possible de faire de l'imputation comme avec des champs numériques. Il convient donc de supprimer les tweets vides (dropNA=True).
X_train, y_train = load_data(all_raw_files[2], split=False, random_state=random_state, dropNA=True)
X_train.head()
print(f'le jeu d\'entraînement initial contient', X_train.shape[0] , 'lignes')
y_train.head()
X_test, y_test = load_data(all_raw_files[1], split=False, random_state=random_state, dropNA=True)
X_test.head()
print(f'le jeu de test contient', X_test.shape[0] , 'lignes')
Cette partie vise uniquement à sélectionner les colonnes dont nous nous servirons et à transcoder la cible au format souhaité.
# Dans ce projet on ne se servira que du champs `text`. On cherche toutefois à conserver le format pandas DataFrame
X_train = X_train[['text']]
#X_val = X_val[['text']]
X_test = X_test[['text']]
X_train.head()
On commence par transformer les cibles pour se conformer aux instructions
X_train.to_parquet('/mnt/data/interim/X_train.gzip',compression='gzip')
X_test.to_parquet('/mnt/data/interim/X_test.gzip',compression='gzip')
On commence par analyser l'équilibre des différentes classes de sentiments
df = pd.concat([X_train, y_train], axis=1)
df.head()
fig = px.histogram(df, x="sentiment", color="sentiment", title = 'Nombre de tweets par sentiment')
fig.show()
Il existe un léger déséquilibre dans les classes en faveur des sentiments neutral
Pour la suite des travaux, on crée un corpus contenant la concaténation de tous les tweets d'une certaine tonalité.
def create_corpus(text_series):
    '''
    Build a flat corpus (list of tokens) from a Series of documents.

    inputs:
        text_series : pd.Series of str, each entry one document/tweet
    returns:
        list of str : every whitespace-separated token, in document order
    '''
    from itertools import chain
    # chain.from_iterable flattens in O(total tokens); the original
    # sum(list_of_lists, []) re-allocated the accumulator at every step (quadratic)
    tokenized = text_series.apply(lambda doc: doc.split())
    return list(chain.from_iterable(tokenized))
positive_text = create_corpus(df['text'][df['sentiment']=='positive'])
negative_text = create_corpus(df['text'][df['sentiment']=='negative'])
neutral_text = create_corpus(df['text'][df['sentiment']=='neutral'])
Il devient alors possible de créer des histogrammes représentant la fréquence de N-grams dans un corpus donné
def plot_freq_dist(text_corpus, nb=30, ngram=1, title=''):
    '''
    Plot the most common n-grams of a corpus as a plotly bar chart.

    inputs:
        text_corpus : a corpus of words (iterable of tokens)
        nb : number of n-grams to plot
        ngram : size of the n-grams (1 = single words)
        title : graph title
    returns:
        nothing, displays the figure
    '''
    freq_pos = Counter(ngrams(create_corpus(pd.Series(text_corpus)), ngram))
    pos_df = pd.DataFrame({
        "words": [' '.join(items) for items in list(freq_pos.keys())],
        "Count": list(freq_pos.values())
    })
    # bug fix: honour the `nb` parameter instead of the hard-coded 30
    common_pos = pos_df.nlargest(columns="Count", n=nb)
    fig = px.bar(common_pos, x="words", y="Count",
                 labels={"words": "Words", "Count": "Frequency"}, title=title)
    fig.show()
plot_freq_dist(positive_text, title = 'Most common words associated with positive tweets')
Le résultat montre la prépondérance des stopwords, ces mots d'articulation, qui sont très communs et gênent l'identification de mots clefs propres à un document / ensemble de documents spécifiques.
Il convient donc d'effectuer des opérations de retraitement du texte pour analyse.
Parmi les éléments propres aux tweets qui peuvent avoir un impact sur la suite on compte :
#@!!!!, looooong, ou l'autocensure f***. Afin de disposer de traitements homogènes, reproductibles et paramétrables, une fonction spécifique est créée. Les différents paramètres pourront être testés dans les phases de modélisation ultérieures.
source preprocess
def preprocess_text(text_series,
                    apply_lemmatizer=True,
                    apply_lowercase=True,
                    apply_url_standerdisation=True,
                    apply_user_standerdisation=True,
                    apply_emoticon_to_words=True,
                    apply_stopwords_removal=True,
                    apply_shortwords_removal=True,
                    apply_non_alphabetical_removal=True,
                    apply_only_2_consecutive_charac=True
                    ):
    '''
    Main preprocess function: applies a configurable chain of cleaning
    steps to each tweet.

    inputs:
        text_series : a pandas Series (or iterable) of raw tweet strings
        apply_* : booleans toggling each cleaning step
    outputs:
        a list of preprocessed strings (one per input tweet); each token
        is followed by a single space, so every output ends with ' '
    '''
    processedText = []
    if apply_lemmatizer:
        # Create the lemmatizer once, outside the tweet loop.
        wordLemm = WordNetLemmatizer()
    if apply_stopwords_removal:
        # perf fix: build the stopword set once; the original called
        # stopwords.words('english') (a fresh list) for every single word
        stopword_set = set(stopwords.words('english'))
    # Regex patterns (kept identical to the original behaviour).
    urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern = r'@[^\s]+'
    alphaPattern = r"[^(\w|\*|(!){2}|#)]"
    sequencePattern = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    for tweet in text_series:
        if apply_lowercase:
            tweet = tweet.lower()
        if apply_url_standerdisation:
            # Replace all URLs with ' URL'
            tweet = re.sub(urlPattern, ' URL', tweet)
        if apply_user_standerdisation:
            # Replace @USERNAME with ' USER'
            tweet = re.sub(userPattern, ' USER', tweet)
        if apply_emoticon_to_words:
            # Replace ASCII emoticons with a single EMO_* token so the
            # later whitespace split keeps them as one word.
            for emo in EMOTICONS:
                val = 'EMO_' + "_".join(EMOTICONS[emo].replace(",", "").split())
                tweet = tweet.replace(emo, ' ' + val + ' ')
            # Replace unicode emojis the same way.
            for emo_uni in UNICODE_EMO:
                val = 'EMO_' + "_".join(UNICODE_EMO[emo_uni].replace(",", "").replace(":", "").split())
                # bug fix: the original replaced `emo` (the stale variable
                # from the previous loop) instead of the unicode emoji itself,
                # so unicode emojis were never converted
                tweet = tweet.replace(emo_uni, ' ' + val + ' ')
        if apply_only_2_consecutive_charac:
            # Replace 3+ consecutive identical characters by 2 (loooong -> loong).
            tweet = re.sub(sequencePattern, seqReplacePattern, tweet)
        if apply_non_alphabetical_removal:
            # Replace every character outside the kept class by a space.
            tweet = re.sub(alphaPattern, " ", tweet)
        tweetwords = ''
        for word in tweet.split():
            if apply_stopwords_removal and word in stopword_set:
                word = ''
            if apply_shortwords_removal and len(word) <= 1:
                word = ''
            if apply_lemmatizer:
                word = wordLemm.lemmatize(word)
            tweetwords += (word + ' ')
        processedText.append(tweetwords)
    return processedText
positive_text_2 = preprocess_text(df['text'][df['sentiment']=='positive'], apply_lemmatizer=False, apply_non_alphabetical_removal=True)
neutral_text_2 = preprocess_text(df['text'][df['sentiment']=='neutral'], apply_lemmatizer=False, apply_non_alphabetical_removal=True)
negative_text_2 = preprocess_text(df['text'][df['sentiment']=='negative'], apply_lemmatizer=False, apply_non_alphabetical_removal=True)
La fonction suivante permettra de réaliser des nuages de mots à partir d'un corpus
def plotWc(text, stopwords=None, title=''):
    """Render a word cloud for a corpus and display it with matplotlib.

    inputs:
        text : str, the whole corpus joined into a single string
        stopwords : iterable of words to exclude (or None)
        title : figure title
    returns:
        nothing, displays the figure
    """
    cloud = WordCloud(
        stopwords=stopwords,
        width=800,
        height=400,
        max_words=1000,
        random_state=44,
        background_color="white",
        collocations=False,
    )
    rendered = cloud.generate(text)
    plt.figure(figsize=(10, 10))
    plt.imshow(rendered, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()
plotWc(" ".join(positive_text_2), stopwords=stopwords.words('english'), title = "Wordcloud des tweets positifs")
Les tweets positifs sont a priori marqués par la forte représentation de mots à connotation positive love, good, happy.
Cet a priori graphique peut être confirmé par un graphique de fréquence des mots individuels les plus présents
plot_freq_dist(create_corpus(positive_text_2), title = 'Most common words associated with positive tweets')
plot_freq_dist(create_corpus(positive_text_2), ngram=2, title = 'Most common 2grams associated with positive tweets')
plot_freq_dist(create_corpus(positive_text_2), ngram=3, title = 'Most common 3grams associated with positive tweets')
plot_freq_dist(create_corpus(positive_text_2), ngram=4, title = 'Most common 4grams associated with positive tweets')
[insight] : Une grande majorité de tweets positifs se rapportent soit à la fête des mère, soit au 4 Mai du fait du jeu de mot avec Star Wars...
Cette spécificité sera surement exploitée par les modèles comme un marqueur probable de tweets positifs.
plotWc(" ".join(neutral_text_2), stopwords=stopwords.words('english'), title = "Wordcloud des tweets neutres")
plot_freq_dist(create_corpus(neutral_text_2), title = 'Most common words associated with neutral tweets')
[Insight] On peut déjà remarquer que le mot day, qui est le plus fréquent des mots clefs des tweets positifs apparaît aussi en 6ème position des mots neutres.
plot_freq_dist(create_corpus(neutral_text_2), ngram=2, title = 'Most common 2grams associated with neutral tweets')
plot_freq_dist(create_corpus(neutral_text_2), ngram=3, title = 'Most common 3grams associated with neutral tweets')
plot_freq_dist(create_corpus(neutral_text_2), ngram=4, title = 'Most common 4grams associated with neutral tweets')
[insight] : On voit une source de confusion arriver avec les tweets neutres dans la mesure où une proportion significative de ceux-ci se rapportent aussi à la fête des mères et Star Wars.
plotWc(" ".join(negative_text_2), stopwords=stopwords.words('english'), title = "Wordcloud des tweets négatifs")
plot_freq_dist(create_corpus(negative_text_2), title = 'Most common words associated with negative tweets')
plot_freq_dist(create_corpus(negative_text_2), ngram=2, title = 'Most common 2grams associated with negative tweets')
plot_freq_dist(create_corpus(negative_text_2), ngram=3, title = 'Most common 3grams associated with negative tweets')
plot_freq_dist(create_corpus(negative_text_2), ngram=4, title = 'Most common 4grams associated with negative tweets')
[insight] : on observe l'utilisation de mots autocensurés (**) et de mots très chargés (hate)
Il ne servira à rien de tester des n-gram de dimension 4 ou plus : le nombre d'occurences est trop faible
def list_words_with(text_series, search='', nb=30):
    '''
    List the words of a corpus that contain a given character sequence.

    inputs :
        - text_series : iterable of strings (e.g. pd.Series) to scan
        - search : regex fragment to look for inside words
        - nb : number of most frequent matches to return
    output :
        - list of (word, count) tuples, most frequent first
    '''
    # NOTE(review): the trailing space relies on preprocess_text() ending
    # every token with ' '; a matching word at the very end of a raw,
    # unpreprocessed string would be missed — confirm inputs are preprocessed.
    searchPattern = rf"\w*{search}\w* "
    counts = Counter()
    for entry in text_series:
        counts.update(re.findall(searchPattern, entry))
    return counts.most_common(nb)
#liste des mots incluant auto-censure **
list_words_with(negative_text_2, search='\*{2}')
#nombre d'utilisateurs
list_words_with(negative_text_2, search='USER')
#nombre d'URLs
list_words_with(negative_text_2, search='URL')
#liste des émojis
list_words_with(negative_text_2, search='EMO\w+')
#les mots qui incluents !!
list_words_with(negative_text_2, search='!!')
#les tweets complets qui incluent 'bs' (apparaît dans les 4grams)
list_words_with(negative_text_2, search='[\w ]* bs [\w ]*')
#listing des mots clefs
list_words_with(negative_text_2, search='#[(\w*|\d*)]+')
def user_names(text_list):
    """Count Twitter @mentions across a list of texts.

    inputs:
        text_list : iterable of strings
    returns:
        Counter mapping each '@...' token to its number of occurrences
    """
    mentions = Counter()
    for entry in text_list:
        mentions.update(tok for tok in entry.split() if tok.startswith('@'))
    return mentions
user_names(positive_text)
user_names(positive_text_2)
user_names(negative_text_2)
user_names(neutral_text_2)
y_train = Preprocess_transform_target(y_train, columns_to_process=['sentiment'])
y_train.head()
y_val = Preprocess_transform_target(y_val, ['sentiment'])
y_val.head()
y_test = Preprocess_transform_target(y_test, ['sentiment'])
y_test.head()
On commence par définir une fonction générique qui sera en capacité d'ajuster, optimiser et logger dans MLFlow les résultats de pipelines qui seront produits pour chaque essai
La cellule suivante permet de créer des étapes de sélection de colonnes dans les Data Frame en entrée
Le mode de fonctionnement souhaité consiste à
from sklearn.base import BaseEstimator, TransformerMixin
class TextSelector(BaseEstimator, TransformerMixin):
    """Pipeline step extracting one text column of a DataFrame as a Series."""

    def __init__(self, field):
        # name of the DataFrame column to select
        self.field = field

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X):
        # 1-D Series output, as expected by downstream text vectorizers
        return X[self.field]
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step extracting one numeric column as a 1-column DataFrame."""

    def __init__(self, field):
        # name of the DataFrame column to select
        self.field = field

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X):
        # double brackets keep the 2-D DataFrame shape (vs a 1-D Series)
        return X[[self.field]]
def score_estimator(
    estimator, X_train, X_test, df_train, df_test, target_col
):
    """
    Evaluate an estimator on the train and test sets with several
    macro-averaged metrics.

    inputs:
        estimator : fitted estimator exposing .predict()
        X_train, X_test : feature sets
        df_train, df_test : DataFrames containing the target column
        target_col : name of the target column
    returns:
        DataFrame indexed by metric name, with 'train' and 'test' columns
        of rounded scores
    """
    metric_funcs = [
        ("f1_macro", f1_score),
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
    ]
    records = []
    for subset_label, features, frame in (
        ("train", X_train, df_train),
        ("test", X_test, df_test),
    ):
        truth = frame[target_col]
        predictions = estimator.predict(features)
        records.extend(
            {"subset": subset_label,
             "metric": metric_name,
             "score": metric_fn(truth, predictions, average='macro')}
            for metric_name, metric_fn in metric_funcs
        )
    # Pivot to metric x {train, test} and round for display.
    table = (
        pd.DataFrame(records)
        .set_index(["metric", "subset"])
        .score.unstack(-1)
        .round(4)
        .loc[:, ['train', 'test']]
    )
    return table
def scores_to_dict(score_df):
    """
    Flatten a score table (index = metric, columns = train/test) into a
    single flat dict suitable for mlflow.log_metrics.

    inputs:
        score_df : DataFrame as returned by score_estimator
    returns:
        dict mapping '<metric>_train' / '<metric>_test' to the scores
    """
    # bug fix: train keys used to carry a stray trailing underscore
    # ('f1_macro_train_'), inconsistent with the '_test' suffix
    flat = {name + '_train': value for name, value in score_df['train'].to_dict().items()}
    flat.update({name + '_test': value for name, value in score_df['test'].to_dict().items()})
    return flat
# Create function so that we could reuse later
def plot_cm(y_test, y_pred, target_names=[-1, 0, 1],
            figsize=(5,3)):
    """Create a labelled confusion matrix plot.

    inputs:
        y_test : true labels
        y_pred : predicted labels
        target_names : axis tick labels, in label-sorted order
        figsize : matplotlib figure size
    """
    matrix = confusion_matrix(y_test, y_pred)
    fig, axis = plt.subplots(figsize=figsize)
    sns.heatmap(matrix, annot=True, fmt='g', cmap='BuGn', cbar=False,
                ax=axis)
    axis.set_title('Confusion matrix')
    axis.set_xlabel('Predicted')
    axis.set_xticklabels(target_names)
    axis.set_ylabel('Actual')
    axis.set_yticklabels(target_names,
                         fontdict={'verticalalignment': 'center'})
def target_params(pipe, dict_keyval):
    """
    Map short hyper-parameter names onto every matching fully-qualified
    pipeline parameter ('<step>__<name>') and assign each the given value.

    inputs:
        pipe : sklearn Pipeline (anything exposing get_params())
        dict_keyval : {short_name: value}, e.g. {'random_state': 42}
    returns:
        {full_param_name: value} for every pipeline parameter whose name
        ends with '__<short_name>'
    """
    # bug fix: the previous implementation interpolated `key` unescaped
    # into a regex run over a space-joined blob of all parameter names;
    # it could not match step names containing digits and would break on
    # keys containing regex metacharacters. Exact suffix matching on the
    # real parameter names is both simpler and correct.
    res = {}
    param_names = list(pipe.get_params().keys())
    for key, value in dict_keyval.items():
        suffix = "__" + key
        res.update({name: value for name in param_names if name.endswith(suffix)})
    return res
def trainPipelineMlFlow(mlf_XP,
                        xp_name_iter,
                        pipeline,
                        X_train, y_train, X_test, y_test,
                        target_col='sentiment',
                        fixed_params={},
                        use_opti=False, iterable_params={}, n_iter=20):
    """
    Generic helper that fits (and optionally tunes) a sklearn pipeline and
    logs parameters, metrics and the model to MLFlow.

    inputs:
        mlf_XP : MLFlow experiment name
        xp_name_iter : MLFlow run name
        pipeline : sklearn Pipeline to fit
        X_train, y_train, X_test, y_test : data (y_* are DataFrames)
        target_col : name of the target column inside y_train / y_test
        fixed_params : params set on the pipeline before fitting
        use_opti : when True, run a RandomizedSearchCV over iterable_params
        iterable_params : param distributions for the random search
        n_iter : number of random-search draws
    returns:
        the fitted pipeline, or the fitted RandomizedSearchCV when use_opti
    """
    mlflow.set_experiment(mlf_XP)
    with mlflow.start_run(run_name=xp_name_iter):
        start_time = time.monotonic()
        warnings.filterwarnings("ignore")
        # fit pipeline
        pipeline.set_params(**fixed_params)
        if not use_opti:
            search = pipeline
        else:
            search = RandomizedSearchCV(estimator = pipeline,
                                        param_distributions = iterable_params,
                                        n_jobs = -1,
                                        cv = 5,
                                        scoring = 'f1_macro',
                                        n_iter = n_iter)
        search.fit(X_train, y_train[target_col])
        # bug fix: copy so the caller's fixed_params dict is not mutated below
        params_to_log = dict(fixed_params)
        if use_opti:
            params_to_log.update(search.best_params_)  # update with the optimal solution
        mlflow.log_params(params_to_log)
        # Evaluate metrics
        y_pred = search.predict(X_test)
        score = score_estimator(estimator=search,
                                X_train=X_train,
                                X_test=X_test,
                                df_train=y_train,
                                df_test=y_test,
                                target_col=target_col
                                )
        # Print out metrics
        print('XP :', xp_name_iter, '\n')
        print('pipeline : \n', score, '\n')
        # bug fix: the original used '"params: \n" % params_to_log', which
        # (mapping % with no format specifiers) silently printed nothing useful
        print("params: \n", params_to_log, '\n')
        print("Confusion matrix: \n")
        # reuse the predictions computed above instead of predicting again
        plot_cm(y_test, y_pred)
        # Report to MlFlow
        mlflow.log_metrics(scores_to_dict(score))
        # NOTE(review): when use_opti is True this logs the template pipeline,
        # not search.best_estimator_ -- confirm this is intended
        mlflow.sklearn.log_model(pipeline, xp_name_iter)
        end_time = time.monotonic()
        elapsed_time = timedelta(seconds=end_time - start_time)
        print('elapsed time :', elapsed_time)
        mlflow.set_tag(key="elapsed_time", value=elapsed_time)
        return search
# Baseline bag-of-words pipeline: select the raw `text` column, vectorise it
# with TF-IDF, then classify with a random forest.
bow_pipeline = Pipeline(
    steps=[
        ('coltext', TextSelector('text')),  # select the column to transform (the corpus)
        ("tfidf", TfidfVectorizer()),
        ("classifier", RandomForestClassifier(n_jobs=-1)),
    ]
)
list(bow_pipeline.get_params().keys())
trainPipelineMlFlow(
mlf_XP = "opti_F1",
xp_name_iter = "test",
pipeline = bow_pipeline,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = {'classifier__random_state':42}
)
params = {
"tfidf__use_idf": [True, False],
"tfidf__ngram_range": [(1, 1), (1, 2), (1,3)],
"classifier__bootstrap": [True, False],
"classifier__class_weight": ["balanced", None],
"classifier__n_estimators": [100, 300, 500, 800, 1200],
"classifier__max_depth": [5, 8, 15, 25, 30],
"classifier__min_samples_split": [2, 5, 10, 15, 100],
"classifier__min_samples_leaf": [1, 2, 5, 10]
}
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - RF-Opti - n_iter_30",
pipeline=bow_pipeline,
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
target_col='sentiment',
fixed_params={'classifier__random_state':42},
use_opti=True,
iterable_params=params,
n_iter=30
)
bow_pipeline_LR = Pipeline(
steps=[
('coltext', TextSelector('text')), #Sélection de la colonne à transformer (corpus)
("tfidf", TfidfVectorizer()),
("classifier", LogisticRegression(solver='liblinear', multi_class='auto')),
]
)
list(bow_pipeline_LR.get_params().keys())
params = {
"tfidf__use_idf": [True, False],
"tfidf__ngram_range": [(1, 1), (1, 2), (1,3)]
}
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",
pipeline=bow_pipeline_LR,
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
target_col='sentiment',
fixed_params={'classifier__random_state':42},
use_opti=True,
iterable_params=params,
n_iter=30
)
pipe = bow_pipeline_LR
params = target_params(pipe, {
"use_idf": [True, False],
"ngram_range": [(1, 1), (1, 2), (1,3), (1,4)]
})
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",
pipeline = pipe,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
use_opti = True,
iterable_params = params,
n_iter = 30
)
pipe = bow_pipeline_LR_prepro
params = target_params(pipe, {
"use_idf": [True, False],
"ngram_range": [(1, 1), (1, 2), (1,3), (1,4)]
})
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",
pipeline = pipe,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
use_opti = True,
iterable_params = params,
n_iter = 30
)
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - LR",
pipeline=bow_pipeline_LR,
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
target_col='sentiment',
fixed_params={'classifier__random_state':42}
)
from sklearn.base import BaseEstimator, TransformerMixin
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible wrapper around preprocess_text().

    Each boolean flag toggles one cleaning step, which makes the whole
    text-cleaning stage tunable by grid / random search.
    """

    def __init__(self,
                 apply_lemmatizer=True,
                 apply_lowercase=True,
                 apply_url_standerdisation=True,
                 apply_user_standerdisation=True,
                 apply_emoticon_to_words=True,
                 apply_stopwords_removal=True,
                 apply_shortwords_removal=True,
                 apply_non_alphabetical_removal=True,
                 apply_only_2_consecutive_charac=True):
        # sklearn convention: store every constructor argument verbatim
        self.apply_lemmatizer = apply_lemmatizer
        self.apply_lowercase = apply_lowercase
        self.apply_url_standerdisation = apply_url_standerdisation
        self.apply_user_standerdisation = apply_user_standerdisation
        self.apply_emoticon_to_words = apply_emoticon_to_words
        self.apply_stopwords_removal = apply_stopwords_removal
        self.apply_shortwords_removal = apply_shortwords_removal
        self.apply_non_alphabetical_removal = apply_non_alphabetical_removal
        self.apply_only_2_consecutive_charac = apply_only_2_consecutive_charac

    def fit(self, X, y=None):
        # stateless transformer: nothing to learn
        return self

    def transform(self, X):
        """Apply preprocess_text to X with the configured flags."""
        flags = dict(
            apply_lemmatizer=self.apply_lemmatizer,
            apply_lowercase=self.apply_lowercase,
            apply_url_standerdisation=self.apply_url_standerdisation,
            apply_user_standerdisation=self.apply_user_standerdisation,
            apply_emoticon_to_words=self.apply_emoticon_to_words,
            apply_stopwords_removal=self.apply_stopwords_removal,
            apply_shortwords_removal=self.apply_shortwords_removal,
            apply_non_alphabetical_removal=self.apply_non_alphabetical_removal,
            apply_only_2_consecutive_charac=self.apply_only_2_consecutive_charac,
        )
        return preprocess_text(X, **flags)
bow_pipeline_LR_prepro = Pipeline(
steps=[
('coltext', TextSelector('text')), #Sélection de la colonne à transformer (corpus)
('prepro', TextPreprocessor()),
("tfidf", TfidfVectorizer()),
("classifier", LogisticRegression(solver='liblinear', multi_class='auto')),
]
)
list(bow_pipeline_LR_prepro.get_params().keys())
trainPipelineMlFlow(
mlf_XP = "DSA_Tweets",
xp_name_iter = "Bag Of Words - LRprepro",
pipeline = bow_pipeline_LR_prepro,
X_train = X_train , y_train = y_train , X_test = X_test , y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs': -1, 'random_state':42})
)
target_params(bow_pipeline_LR_prepro, {'n_jobs': -1, 'random_state':42})
pipe = bow_pipeline_LR_prepro
trainPipelineMlFlow(
mlf_XP = "DSA_Tweets",
xp_name_iter = "Bag Of Words - LRprepro",
pipeline = pipe,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs': -1, 'random_state':42, 'apply_emoticon_to_words':False})
)
params = target_params(pipe, {'apply_emoticon_to_words': [True, False] , 'apply_lemmatizer': [True, False], 'apply_lowercase': [True, False], 'apply_non_alphabetical_removal': [True, False], 'apply_shortwords_removal': [True, False], 'apply_stopwords_removal': [True, False], 'apply_url_standerdisation': [True, False], 'apply_user_standerdisation': [True, False] })
pipe = bow_pipeline_LR
params = target_params(pipe,
{"use_idf": [True, False]}
)
pipe = bow_pipeline_LR_prepro
trainPipelineMlFlow(
mlf_XP = "DSA_Tweets",
xp_name_iter = "Bag Of Words - LRprepro - Opti",
pipeline = pipe,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs': -1, 'random_state':42}),
use_opti = True,
iterable_params = params
)
pipe = bow_pipeline_LR_prepro
params = target_params(pipe, {'apply_emoticon_to_words': [True, False],
'apply_lemmatizer': [True, False],
'apply_lowercase': [True, False],
'apply_non_alphabetical_removal': [True, False],
'apply_shortwords_removal': [True, False],
'apply_stopwords_removal': [True, False],
'apply_url_standerdisation': [True, False],
'apply_user_standerdisation': [True, False]
})
params
pipe = bow_pipeline_LR_prepro
params = target_params(pipe, {
"use_idf": [True, False]
})
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - LR-Opti - n_iter_30",
pipeline = pipe,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
use_opti = True,
iterable_params = params,
n_iter = 30
)
X_train_prepro = pd.DataFrame(preprocess_text(X_train['text']), columns=['text'])
X_train_prepro
X_test_prepro = pd.DataFrame(preprocess_text(X_test['text']), columns=['text'])
X_test
pipe = bow_pipeline_LR
params = target_params(pipe, {
"use_idf": [True, False]
})
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="Bag Of Words - LR-prepro",
pipeline = pipe,
X_train = X_train_prepro, y_train = y_train, X_test = X_test_prepro, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
use_opti = True,
iterable_params = params,
n_iter = 30
)
import torch
torch.cuda.is_available()
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Normalise a tweet for the cardiffnlp twitter-roberta models.

    Replaces @mentions (length > 1) with '@user' and anything starting
    with 'http' with 'http', keeping the rest of the tokens unchanged.
    """
    normalised = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            normalised.append('@user')
        elif token.startswith('http'):
            normalised.append('http')
        else:
            normalised.append(token)
    return " ".join(normalised)
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"
model = AutoModelForSequenceClassification.from_pretrained('/mnt/pretrained_models/'+MODEL)
tokenizer = AutoTokenizer.from_pretrained('/mnt/pretrained_models/'+MODEL)
config = AutoConfig.from_pretrained('/mnt/pretrained_models/'+MODEL)
# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]
nlp=pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0, return_all_scores=True)
def TorchTwitterRoBERTa_Pred(text = "Good night 😊"):
    """Score one text with the twitter-roberta sentiment pipeline.

    returns:
        (neg, neu, pos) : the three class probabilities
    """
    cleaned = preprocess(text)
    scores = nlp(cleaned)[0]
    # the pipeline returns one dict per class, in order [negative, neutral, positive]
    return scores[0]['score'], scores[1]['score'], scores[2]['score']
test = TorchTwitterRoBERTa_Pred()
test
def run_loopy_roBERTa(df):
    """Score every row of a one-column DataFrame with the roBERTa model.

    inputs:
        df : DataFrame whose first column holds the raw tweets
    returns:
        DataFrame with roBERTa_neg / roBERTa_neu / roBERTa_pos columns
    """
    v_neg, v_neu, v_pos = [], [], []
    for _, row in df.iterrows():
        neg, neu, pos = TorchTwitterRoBERTa_Pred(row.values[0])
        v_neg.append(neg)
        v_neu.append(neu)
        v_pos.append(pos)
    return pd.DataFrame({'roBERTa_neg': v_neg,
                         'roBERTa_neu': v_neu,
                         'roBERTa_pos': v_pos})
class clTwitterroBERTa(BaseEstimator, TransformerMixin):
    """Pipeline step producing roBERTa sentiment scores for a text column."""

    def __init__(self, field):
        # name of the column holding the raw tweets
        self.field = field

    def fit(self, X, y=None):
        # pretrained model: nothing to fit
        return self

    def transform(self, X):
        # double brackets keep the one-column DataFrame shape that
        # run_loopy_roBERTa iterates over
        return run_loopy_roBERTa(X[[self.field]])
# Standalone feature-extraction pipeline: raw tweets -> roBERTa scores.
roBERTa_pipe=Pipeline([
    ('roBERTa', clTwitterroBERTa(field='text'))
])
# Full model: roBERTa sentiment scores fed into a random forest classifier.
roBERTa_RF_Pipe = Pipeline(
    steps=[
        ('roBERTa', roBERTa_pipe),
        ("classifier", RandomForestClassifier(n_jobs=-1))
    ]
)
pipe = roBERTa_RF_Pipe
trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="roBERTa - LR",
pipeline = pipe,
X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42})
)
import gc
gc.collect()
torch.cuda.empty_cache()
import torch
torch.cuda.empty_cache()
X_train_roBERTa = roBERTa_pipe.transform(X_train)
X_train_roBERTa
X_test_roBERTa = roBERTa_pipe.transform(X_test)
X_train_roBERTa.to_parquet('/mnt/data/interim/X_train_roBERTa.gzip',compression='gzip')
X_test_roBERTa.to_parquet('/mnt/data/interim/X_test_roBERTa.gzip',compression='gzip')
roBERTa_RF = Pipeline(
steps=[
("classifier", RandomForestClassifier(n_jobs=-1))
]
)
pipe = roBERTa_RF
params = target_params(pipe, {
"bootstrap": [True, False],
"class_weight": ["balanced", None],
"n_estimators": [100, 300, 500, 800, 1200],
"max_depth": [5, 8, 15, 25, 30],
"min_samples_split": [2, 5, 10, 15, 100],
"min_samples_leaf": [1, 2, 5, 10]
})
roBERTa_RF_=trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="roBERTa - RF - opti - 30",
pipeline = pipe,
X_train = X_train_roBERTa, y_train = y_train, X_test = X_test_roBERTa, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
use_opti = True,
iterable_params=params,
n_iter=30
)
class Blob(BaseEstimator, TransformerMixin):
    """Pipeline step computing TextBlob polarity/subjectivity features."""

    def __init__(self, field):
        # name of the column holding the raw tweets
        self.field = field

    def fit(self, X, y=None):
        # pretrained lexicon: nothing to fit
        return self

    def transform(self, X):
        """Return a DataFrame with 'polarity' and 'subjectivity' columns.

        bug fix: the original assigned the new columns directly onto the
        input DataFrame, mutating the caller's data as a side effect.
        """
        sentiments = X[self.field].apply(lambda t: TextBlob(t).sentiment).apply(pd.Series)
        sentiments.columns = ['polarity', 'subjectivity']
        return sentiments
blob_pipe=Pipeline([
('blob', Blob(field='text'))
])
X_train_Blob=blob_pipe.transform(X_train)
X_train_Blob.head()
X_test_Blob=blob_pipe.transform(X_test)
X_test_Blob.head()
X_train_Blob.to_parquet('/mnt/data/interim/X_train_Blob.gzip',compression='gzip')
X_test_Blob.to_parquet('/mnt/data/interim/X_test_Blob.gzip',compression='gzip')
class Vader(BaseEstimator, TransformerMixin):
    """Pipeline step computing NLTK VADER sentiment features."""

    def __init__(self, field):
        # name of the column holding the raw tweets
        self.field = field
        # bug fix: the original built a throwaway analyzer in __init__ and a
        # second one on every transform() call; build it once and reuse it
        self.sid = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        # pretrained lexicon: nothing to fit
        return self

    def transform(self, X):
        """Return a DataFrame with neg/neu/pos/compound columns.

        bug fix: no longer mutates the caller's DataFrame in place.
        """
        scores = X[self.field].apply(self.sid.polarity_scores).apply(pd.Series)
        return scores[['neg', 'neu', 'pos', 'compound']]
vader_pipe=Pipeline([
('vader', Vader(field='text'))
])
X_train_Vader=vader_pipe.transform(X_train)
X_train_Vader.head()
X_test_Vader=vader_pipe.transform(X_test)
X_test_Vader.head()
X_train_Vader.to_parquet('/mnt/data/interim/X_train_Vader.gzip',compression='gzip')
X_test_Vader.to_parquet('/mnt/data/interim/X_test_Vader.gzip',compression='gzip')
X_train_compound = pd.concat([X_train_roBERTa, X_train_Blob, X_train_Vader], axis=1)
X_test_compound = pd.concat([X_test_roBERTa, X_test_Blob, X_test_Vader], axis=1)
X_train_compound.head()
X_test_compound.head()
pipe = roBERTa_RF
params = target_params(pipe, {
"bootstrap": [True, False],
"class_weight": ["balanced", None],
"n_estimators": [100, 300, 500, 800, 1200],
"max_depth": [5, 8, 15, 25, 30],
"min_samples_split": [2, 5, 10, 15, 100],
"min_samples_leaf": [1, 2, 5, 10]
})
roBERTa_RF_=trainPipelineMlFlow(
mlf_XP="DSA_Tweets",
xp_name_iter="roBERTa_Blob_Vader - RF - opti - 30",
pipeline = pipe,
X_train = X_train_compound, y_train = y_train, X_test = X_test_compound, y_test = y_test,
target_col = 'sentiment',
fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
use_opti = True,
iterable_params=params,
n_iter=30
)
import xgboost as xgb

# Single-step pipeline wrapping an XGBoost classifier; the compound
# feature matrix (roBERTa + TextBlob + VADER) is fed to it directly,
# so no preprocessing steps are needed here.
roBERTa_xgb = Pipeline([("classifier", xgb.XGBClassifier())])
# Randomized hyper-parameter search for the XGBoost pipeline on the same
# compound feature set.
pipe = roBERTa_xgb
# Classic XGBoost search space: learning rate (eta), tree depth,
# child-weight regularisation, split-gain threshold (gamma) and
# per-tree column subsampling.
params = target_params(pipe, {
    "eta" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
    "max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" : [ 1, 3, 5, 7 ],
    "gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
    "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
})
# MLflow-tracked training with 20 sampled parameter combinations.
roBERTa_xgb_ = trainPipelineMlFlow(
    mlf_XP="DSA_Tweets",
    xp_name_iter="roBERTa - xgb - opti",
    pipeline = pipe,
    X_train = X_train_compound, y_train = y_train, X_test = X_test_compound, y_test = y_test,
    target_col = 'sentiment',
    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
    use_opti = True,
    iterable_params=params,
    n_iter=20
)
# Baseline: bag-of-words pipeline trained once without hyper-parameter
# search (use_opti=False), logged under a separate MLflow experiment.
pipe = bow_pipeline
essai_=trainPipelineMlFlow(
    mlf_XP="opti F1",
    xp_name_iter="test",
    pipeline = pipe,
    X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test,
    target_col = 'sentiment',
    fixed_params = target_params(pipe, {'n_jobs':-1,'random_state':42}),
    use_opti = False
)
# Inspect the raw class probabilities on the training set (notebook display).
essai_.predict_proba(X_train)
X_train.head()
# For each true sentiment class, plot the distributions of the predicted
# probabilities of the three classes, to eyeball class separation before
# threshold tuning. Probability columns are assumed to follow the sorted
# class order [-1, 0, 1] => [neg, neu, pos] — TODO confirm via essai_.classes_.
for var in [-1, 0, 1]:
    plt.figure(figsize=(12,4))
    sns.distplot(essai_.predict_proba(X_train)[(y_train['sentiment']==var),0], bins=30, kde=False,
                 color='green', label='Negative')
    sns.distplot(essai_.predict_proba(X_train)[(y_train['sentiment']==var),1], bins=30, kde=False,
                 color='red', label='Neutral')
    sns.distplot(essai_.predict_proba(X_train)[(y_train['sentiment']==var),2], bins=30, kde=False,
                 color='blue', label='Positive')
    plt.legend()
    plt.title(f'Histogram of {var} by true sentiment');
Stratégie : on choisit d'abord le seuil qui maximise le F1 de la décision positive, puis, parmi les non-positifs, le seuil qui maximise le F1 des négatifs ; le reste est classé neutre.
# Threshold a probability vector into hard 0/1 labels.
def to_labels(pos_probs, threshold):
    """Return 1 wherever ``pos_probs`` reaches ``threshold``, else 0."""
    hits = pos_probs >= threshold
    return hits.astype('int')
def _best_f1_threshold(class_probs, y_bin):
    """Return (threshold, f1) maximising the binary F1 over a 0.001 grid.

    class_probs: predicted probabilities for one class.
    y_bin: binary (0/1) ground truth for that same class.
    """
    thresholds = np.arange(0, 1, 0.001)
    scores = [f1_score(y_bin, to_labels(class_probs, t)) for t in thresholds]
    ix = np.argmax(scores)
    return thresholds[ix], scores[ix]

def find_optimal_f1_thresholds(pipe, X, y):
    """Tune per-class decision thresholds for the positive and negative classes.

    Strategy: find the threshold maximising F1 for the positive class, and
    independently the threshold maximising F1 for the negative class.
    Returns a dict with keys 'pos_threshold', 'pos_f1', 'neg_threshold',
    'neg_f1'.

    Refactor notes: the original duplicated the whole grid search for each
    class (extracted into _best_f1_threshold), and rebuilt the binarized
    target list inside the 1000-iteration threshold loop (now hoisted out).
    Columns 2 and 0 of predict_proba are assumed to be the positive (+1)
    and negative (-1) classes respectively — sorted class order [-1, 0, 1];
    TODO confirm via pipe.classes_.
    """
    probs = pipe.predict_proba(X)
    # Positive class (+1): column 2.
    y_pos = [1 if i == 1 else 0 for i in y]
    pos_t, pos_f1 = _best_f1_threshold(probs[:, 2], y_pos)
    res = {'pos_threshold': pos_t, 'pos_f1': pos_f1}
    # Negative class (-1): column 0.
    y_neg = [1 if i == -1 else 0 for i in y]
    neg_t, neg_f1 = _best_f1_threshold(probs[:, 0], y_neg)
    res.update({'neg_threshold': neg_t, 'neg_f1': neg_f1})
    return res
# Tune the decision thresholds on the TRAINING set for the RF model
# (tuning on train avoids leaking test information into the thresholds).
thres = find_optimal_f1_thresholds(roBERTa_RF_, X_train_compound, y_train['sentiment'])
# Notebook displays: chosen thresholds, targets and raw probabilities.
thres
y_train['sentiment']
roBERTa_RF_.predict_proba(X_train_compound)
def sentiment_predict(pipe, X, dict_thres):
    """Predict sentiment labels (-1/0/1) using tuned probability thresholds.

    A row is labelled +1 when its positive-class probability (column 2)
    reaches dict_thres['pos_threshold']; otherwise it is labelled -1 when
    its negative-class probability (column 0) reaches
    dict_thres['neg_threshold']; everything else stays 0 (neutral).
    """
    pos_threshold = dict_thres['pos_threshold']
    neg_threshold = dict_thres['neg_threshold']
    probs = pipe.predict_proba(X)
    is_pos = to_labels(probs[:, 2], pos_threshold)
    is_neg = to_labels(probs[:, 0], neg_threshold)
    # Positive decision wins; among the rest, a negative hit becomes -1.
    preds = is_pos.copy()
    non_pos = (is_pos == 0)
    preds[non_pos] = -is_neg[non_pos]
    return preds
# Apply the tuned thresholds to the TEST set and score with macro-averaged
# F1 (each class weighted equally) for the RF model...
y_test_pred = sentiment_predict(roBERTa_RF_, X_test_compound,thres)
f1_score(y_test, y_test_pred, average='macro')
# ...and the same tune-on-train / evaluate-on-test procedure for XGBoost.
thres_xgb = find_optimal_f1_thresholds(roBERTa_xgb_, X_train_compound, y_train['sentiment'])
y_test_pred_xgb = sentiment_predict(roBERTa_xgb_, X_test_compound,thres_xgb)
f1_score(y_test, y_test_pred_xgb, average='macro')
# SHAP for model explanation; initjs() enables the interactive JS plots
# in the notebook.
import shap
shap.initjs()
# Build a gensim dictionary (token -> id) for topic modelling.
import gensim.corpora as corpora
# data_words is presumably the tokenized corpus built elsewhere in the
# notebook — not visible in this section.
id2word = corpora.Dictionary(data_words)